
#####################################################################
##
##  Replication Package to:
##
##    Kevin Bryan and Jorge Guzman. 2023. "Entrepreneurial Migration". 
##           The Review of Economics and Statistics.
##
##
##
#####################################################################


##  Overview: Results are presented in the order they are introduced in the paper. First
##
##    Script must be run in order.
##
##

#clear environment
rm(list=ls())

#install required packages
list.of.packages <- c("stringr","lfe",  "readstata13",  "stargazer",  "ggplot2",  
                      "survival",  "ggpubr",  "permute",  "tidyr",  "Hmisc","dplyr",
                      "pastecs","binsreg","testthat","magrittr","ggrepel","remotes",
                      "lmtest","sandwich","utils","estimatr","grid","gridExtra","usmap","sf","maptools","rgdal")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

if(!(c("starpolishr") %in% installed.packages()[,"Package"])){
  remotes::install_github("ChandlerLutz/starpolishr")  
}

#setwd("C:/Users/jag2367/Dropbox/Bryan_Guzman_Migration/Final_Submission/Data_Appendix")

#Load utilities (also loads packages)
source("utils/utils.R")
source("utils/star_panel.R")
source("bryan_guzman_functions.R")
source("estimate_city_utility.R")


####  Parameters ######



#################################
##
## Figure 1: Create Map
## Note: Only outputs CSV, map was created in Tableau
##

migration.cbsa <- get_agg_cbsa() 

moves <- get_moves_in_out_from_migration_cbsa(migration.cbsa) %>%
  left_join(get_cbsa_names()) %>%
  filter(!(cbsa %in% data.bad_cbsa)) ##TO DO: remove this bad cbsas piece?


startup.births <- get_startup_births()
moves <- moves%>%
  left_join(startup.births)

pop <- get_cbsa_pop() 
moves <- moves%>%
  left_join(pop)

city.utils <- estimate_obs_and_recpi_utils(migration.cbsa)
moves <- city.utils %>% select(cbsa, city.rank, city.util,city.util.q) %>%
  inner_join(moves)


moves <- moves %>%
  filter(cbsa.name != "") %>%
  mutate(net.moves = moves.in - moves.out,
         net.moves.pop = net.moves / pop2010,
         eship.pop = city.DEobs / pop2010) %>%
  filter(log(city.util) > -10)


moves.csv <- moves %>%
  mutate(
    net_move_rate = log(moves.out/moves.in),
    log_firms = log(city.DEobs),
    log_firms_by_pop  =log(city.DEobs/pop2010)
  ) %>%
  filter(!is.na(log_firms))


write.csv(moves, "../tex/map2.csv")


cbsa_locations <- sf::st_read("./data/tl_2020_us_cbsa.shp")
names(cbsa_locations)
moves$str_cbsa = as.character(moves$cbsa)
moves_map <- moves %>% left_join(cbsa_locations,by=c("str_cbsa"="CBSAFP"))

t(moves_map %>% filter(grepl("San Jose",cbsa.name)))

options(digits=12)
moves_map$INTPTLAT = as.numeric(moves_map$INTPTLAT)
moves_map$INTPTLON = as.numeric(moves_map$INTPTLON)

moves_map$INTPTLAT[grepl("San Fran",moves_map$cbsa.name)] = 37.763658
moves_map$INTPTLON[grepl("San Fran",moves_map$cbsa.name)] = -122.427824



moves_map$INTPTLAT[grepl("San Jose",moves_map$cbsa.name)] = 37.397792
moves_map$INTPTLON[grepl("San Jose",moves_map$cbsa.name)] = -121.963953


if(.Platform$OS.type == "windows"){
    #Only works on windows

    moves_map = usmap::usmap_transform(moves_map,input_names = c("INTPTLON","INTPTLAT"), output_names=c("mapx","mapy"))
    moves_map$eship_point = log(moves_map$eship.pop)
    
    usmap::plot_usmap("states", labels = TRUE,label_color = "darkgrey") +
        geom_point(aes(x=mapx, y=mapy,size=pop2010*2,fill=eship_point), data=moves_map, colour="black",pch=21) +
        theme(legend.position="none") +
        scale_fill_gradient2(low="white",mid="white",high="darkred",midpoint = median(moves_map$eship_point))+
        ggtitle("A. New Firm Formation by CBSA")
    ggsave("../tex/BirthsMap.png",device="png", width=9, height=6, units="in")
    
    moves_map$movers_point = log(moves_map$moves.in/moves_map$moves.out)
    
    moves_map <- moves_map %>%
        arrange(movers_point) %>%
        mutate(movers_point2 = row_number()/nrow(moves_map))
    moves_map$movers_point2
    usmap::plot_usmap("states", labels = TRUE,label_color = "darkgrey") +
        geom_point(aes(x=mapx, y=mapy,size=pop2010*2,fill=movers_point2), data=moves_map, colour="black",pch=21) +
        theme(legend.position="none") +
        scale_fill_gradient2(low="white",mid = "white", high="darkred",midpoint = median(moves_map$movers_point2))+
        ggtitle("B. Net Startup Migration by CBSA")
    ggsave("../tex/NetMovesMap.png",device="png", width=9, height=6, units="in")
}

################################################
##
## Figure 2: Log move ratio vs startups per pop
##
##

moves.graph <- moves %>%
  mutate(log.eship.pop = log(eship.pop),
         log.net.moves = log(moves.in/moves.out))

library(ggrepel)
ggplot(data = moves.graph, aes(log.eship.pop, log.net.moves)) +
  geom_point(aes(size=pop2010), shape=1) +
  geom_smooth(method="lm") +  
  theme_bw()+ theme(legend.position = "none") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  xlab("Log Delaware Startups per Pop") + ylab("Log(Moves in / Moves out)") +
  geom_text_repel(aes(label=cbsa.name), size=3, color="grey40") 


ggsave("../tex/startups_vs_migration_rate.png",device="png", width=7, height=5, units="in")


################################################
##
## Figure 3: Migration by age
##
##

migration_data <- bg.fx.load_migration_data_by_firm(only_if_missing=TRUE)

migration_data$statecode = as.factor(migration_data$datastate)
migrants = migration_data %>% select(incyear, incdate, moveddate, statecode,time_to_move)
migrants$id = 1:nrow(migrants)

all_migrants = NA

for(i in 0:20) { 
  print(paste0("Iteration: ", i))
  new_migrants = data.frame(migrants)
  new_migrants$age = i
  all_migrants = rbind(all_migrants, new_migrants)
}


all_migrants <- all_migrants %>%
  mutate(
    has_moved = replace_na(time_to_move < 90 + (age+1)*365,0),
    just_moved = replace_na(time_to_move >= 90 + age*365 & time_to_move < 90 + (age+1)*365,0)
  )


all_migrants = all_migrants %>% filter((age+incyear)<=2014)
all_migrants$obs_age = all_migrants$age
haz.r2= felm(just_moved ~0+ as.factor(obs_age)|0|0|statecode+incyear, data=all_migrants)


coefs.r2 <- data.frame(coef(summary(haz.r2)))
coefs.r2 <- coefs.r2 %>% add_rownames(var="name") %>%  mutate(age=seq(n())-1)    %>% 
  rename(b=Estimate,sd=Cluster.s.e.) %>% select(b,sd,name,age) %>% 
  filter(grepl("factor",name)) %>% filter(age < 12)


ggplot(coefs.r2, aes(x=age, y=b)) + theme_bw()+
  geom_errorbar(aes(ymin=b+1.96*sd, ymax=b-1.96*sd), width=.2,colour="dodgerblue4") +
  geom_line(linetype="dotted",colour="gray50") + 
  geom_point(colour="chocolate3",size=3) + 
  ylab("P(Moves at age t)") + xlab("Firm age") + 
  geom_hline(yintercept = 0, color="gray10") +
  theme(plot.title = element_text(hjust = 0.5,face="bold"),text=element_text(size=14))

ggsave(file="../tex/age_moving_prob.png",device="png", width=5, height=5, units="in")




################################################
##
## Figure 4: Migration rate over time
##
##


migration_data <- bg.fx.load_migration_data_by_firm(only_if_missing=TRUE)

migration_by_year <- migration_data %>% 
  mutate(
    move5_corp = case_when(
      (is_corp==1) ~ as.integer(move5), 
      TRUE~ NA_integer_
    ),
    move5_patent = case_when(
      patent==1 ~ move5, TRUE~ NA_integer_
    ),
    move5_trademark = case_when(
      trademark==1 ~ move5, TRUE~ NA_integer_
    )
  ) %>%
  group_by(incyear) %>% 
  summarise(me = mean(move5), 
            me_corp= mean(move5_corp, na.rm=TRUE), 
            me_patent = mean(move5_patent, na.rm=TRUE),
            me_trademark = mean(move5_trademark, na.rm=TRUE),
  )

migration.pre_2010 <- migration_by_year[which(migration_by_year$incyear <= 2010),]
migration.post_2010 <- migration_by_year[which(migration_by_year$incyear > 2010),]

reg.pre_2010 <- lm(me ~ incyear, data=migration.pre_2010)
summary(reg.pre_2010, robust=T)

p1 = ggplot(aes(x = incyear, y = me), data = migration.pre_2010) +  theme_bw()+
  geom_bar(stat = "identity",colour="gray25",fill="darkgoldenrod1") + 
  geom_bar(aes(x=incyear, y=me),stat="identity", data=migration.post_2010, colour="gray25",fill="gray90") + 
  ylab("Share Moving") + xlab("Founding year") +
  ggtitle("Share of Firms that Move") +
  geom_vline(xintercept=2010.5, colour="gray20",linetype="dotted") +
  annotate("text",x=2008,y=.06,label="Complete\nEstimates") + 
  annotate("text",x=2013,y=.05,label="Truncated\nEstimates") + 
  theme(plot.title = element_text(hjust = 0.5,face="bold"),text=element_text(size=14))





p2<-  ggplot(aes(x = incyear, y = me_corp), data = migration.pre_2010) +  theme_bw()+
  geom_bar(stat = "identity",colour="gray25",fill="darkgoldenrod1") + 
  geom_bar(aes(x=incyear, y=me_corp),stat="identity", data=migration.post_2010, colour="gray25",fill="gray90") + 
  ylab("Share Moving") + xlab("Founding year") +
  ggtitle("Corporations") +
  geom_vline(xintercept=2010.5, colour="gray20",linetype="dotted") +
  annotate("text",x=2008,y=.06,label="Complete\nEstimates") + 
  annotate("text",x=2013,y=.05,label="Truncated\nEstimates") + 
  theme(plot.title = element_text(hjust = 0.5,face="bold"),text=element_text(size=14))

p3 <-  ggplot(aes(x = incyear, y = me_patent), data = migration.pre_2010) +  theme_bw()+
  geom_bar(stat = "identity",colour="gray25",fill="darkgoldenrod1") + 
  geom_bar(aes(x=incyear, y=me_patent),stat="identity", data=migration.post_2010, colour="gray25",fill="gray90") + 
  ylab("Share Moving") + xlab("Founding year") +
  ggtitle("Patent at Founding") +
  geom_vline(xintercept=2010.5, colour="gray20",linetype="dotted") +
  annotate("text",x=2008,y=.06,label="Complete\nEstimates") + 
  annotate("text",x=2013,y=.05,label="Truncated\nEstimates") + 
  theme(plot.title = element_text(hjust = 0.5,face="bold"),text=element_text(size=14))

p4 <-  ggplot(aes(x = incyear, y = me_trademark), data = migration.pre_2010) +  theme_bw()+
  geom_bar(stat = "identity",colour="gray25",fill="darkgoldenrod1") + 
  geom_bar(aes(x=incyear, y=me_trademark),stat="identity", data=migration.post_2010, colour="gray25",fill="gray90") + 
  ylab("Share Moving") + xlab("Founding year") +
  ggtitle("Trademark at Founding") +
  geom_vline(xintercept=2010.5, colour="gray20",linetype="dotted") +
  annotate("text",x=2008,y=.06,label="Complete\nEstimates") + 
  annotate("text",x=2013,y=.05,label="Truncated\nEstimates") + 
  theme(plot.title = element_text(hjust = 0.5,face="bold"),text=element_text(size=14))


#force a bug not plotting figure
devices=  dev.list()
if("RStudioGD" %in% names(devices)) {
    dev.set(devices[grep("RStudioGD", names(devices))])
}

ggarrange(p1,p2,p3, p4, ncol=2, nrow=2)
ggsave("../tex/migration_rate_over_time.png", device="png", width=12, units="in", height=6.5)


###########################################################
##
##
##  Figure: Migration Rates and Taxes
##
##
##


migration.cbsa <- get_agg_cbsa() 
city.utils <- estimate_obs_and_recpi_utils(migration.cbsa)
moves <- get_moves_in_out_from_migration_cbsa(migration.cbsa)  %>%
  mutate(net.moves = log(moves.in / moves.out))

migration.cbsa.llc <-  get_agg_cbsa(include_llcs=TRUE)

llc_moves <- get_moves_in_out_from_migration_cbsa(migration.cbsa.llc) %>%
  mutate(net.moves.llc = log(moves.in/moves.out))

data.external<-read.csv("./data/external_nonalbouy_185cbsas.csv") %>%
  filter(year == 2010)

names(moves)

new_moves <- moves %>%
  left_join(get_cbsa_pop(), by="cbsa") %>%
  left_join(llc_moves , by="cbsa")   %>%
  inner_join(data.external, by="cbsa")



a <- ggplot(aes(x=net.moves, y=tax.average_tax_rate_95th_perc), data=new_moves) +
  geom_point(aes(size=pop2010), shape = 1) +
  ggrepel::geom_text_repel(aes(label=cbsaname), size=3, color="grey40", max.overlaps=10)+
  geom_smooth(method=lm) + theme_bw()+ theme(legend.position = "none") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  xlab("Log(Moves In / Moves Out)") + 
  ylab("Personal Income Tax Rate at 95th Percentile") +
  ggtitle("Corporation Migration")
a

b<- ggplot(aes(x=net.moves.llc, y=tax.average_tax_rate_95th_perc), data=new_moves) +
  geom_point(aes(size=pop2010), shape = 1) +
  ggrepel::geom_text_repel(aes(label=cbsaname), size=3, color="grey40", max.overlaps=10)+
  geom_smooth(method=lm) + theme_bw()+ theme(legend.position = "none") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  xlab("Log(Moves In / Moves Out)") + 
  ylab("Personal Income Tax Rate at 95th Percentile") +
  ggtitle("LLC Migration")
b


ggarrange(a,b, ncol=2) 

ggsave("../tex/migration_rates_and_taxes.png",device="png", width=13, height=7, units="in")









################################################
##
## Table 1: Summary Statistics Data
##
##



migration.firms = bg.fx.load_migration_data_by_firm(only_if_missing=TRUE) %>%
  filter(!(datastate %in% c("MD"))) %>%
  filter(is_corp == 1)

migration.firms.llc = bg.fx.load_migration_data_by_firm(only_if_missing=TRUE, corp=FALSE) %>%
  filter(!(datastate %in% c("MD"))) %>%
  mutate(is_llc = 1-is_corp) %>%
  filter(is_llc == 1)

summary_stats.data = migration.firms  %>%
  select(incyear, ipo, acq, patent, trademark, move2, move5) 


summary_stats.data.llc = migration.firms.llc  %>%
  select(incyear, ipo, acq, patent, trademark, move2, move5) 

total_firms.corp <- sum(migration.firms$is_corp) 
total_firms.llc <- sum(migration.firms.llc$is_llc) 
total_firms <- total_firms.corp+total_firms.llc



state_flows.corp <- migration.firms %>%
  filter(is_corp == 1) %>%
  mutate(sourcestate=datastate) %>%
  group_by(sourcestate, deststate) %>%
  summarise(moves = sum(move5))


state_flows.llc <- migration.firms.llc %>%
  filter(is_corp == 0) %>%
  mutate(sourcestate=datastate) %>%
  group_by(sourcestate, deststate) %>%
  summarise(moves = sum(move5))


est_migration_rate.corp <- estimate_true_migration_rate(state_flows.corp,total_firms.corp, movevar="moves")
est_migration_rate.llc <- estimate_true_migration_rate(state_flows.llc,total_firms.llc,movevar="moves")


summary_stats.labels = c("Incorporation Year","IPO","Acquired","Patent","Trademark","Moves in 2 years", "Moves in 5 years")
migration.firms.N <- nrow(migration.firms)
migration.firms.llc.N <- nrow(migration.firms.llc)

panela <- stargazer(summary_stats.data, type="latex", 
                    title=paste0("Summary Statistics"),
                    covariate.labels = summary_stats.labels, summary.stat = c("mean","sd"))  




panelb <- stargazer(summary_stats.data.llc, type="latex", 
                    title=paste0("Summary Statistics for Corporations (N=",migration.firms.llc.N),
                    covariate.labels = summary_stats.labels, summary.stat = c("mean","sd"))  

library(starpolishr)
source("utils/star_panel.R")

outtex <- star_panel(panela, panelb, same.summary.stats = FALSE,
                     panel.names=c(paste0("Corporations (N=",migration.firms.N,")"),
                                   paste0("LLCs (N=",migration.firms.llc.N,")")
                     )
)

line_num = grep("\\\\end\\{tabular",outtex) -1
outtex <- star_insert_row(outtex, paste0("\\emph{Panel C: Estimated 5-year U.S. Migration Rates} \\\\"), insert.after = line_num)
outtex <- star_insert_row(outtex, paste0("Corporations & ",round(est_migration_rate.corp,3),"  \\\\"), insert.after = line_num+1)
outtex <- star_insert_row(outtex, paste0("LLCs & ",round(est_migration_rate.llc,3),"  \\\\"), insert.after = line_num+2)
outtex <- star_insert_row(outtex,"\\hline \\hline  \\\\" , insert.after = line_num+3)




fileconn <- file( "../tex/data_summary_stats.tex")
writeLines(outtex, fileconn)






################################################
##
## Table 3: List of Estimated City Utilities
##    This table is presented after Table 2 in the paper but must be estimated
##    before table 2 in the code


migration.cbsa  <- get_agg_cbsa(include_llcs=FALSE)
migration.cbsa.llc <-  get_agg_cbsa(include_llcs=TRUE)


migration.cbsa <- migration.cbsa %>% 
  filter(!(source.cbsa %in% bad.cbsas) & !(dest.cbsa %in% bad.cbsas))

migration.cbsa.llc <- migration.cbsa.llc %>% 
  filter(!(source.cbsa %in% bad.cbsas) & !(dest.cbsa %in% bad.cbsas))


## Estimate city utilities
city.utils <- estimate_obs_and_recpi_utils(migration.cbsa)

city.utils.llc <- estimate_obs_and_recpi_utils(migration.cbsa.llc) %>%
  select(cbsa, city.util, log.city.util, city.rank) %>%
  rename(
    "city.util.llc" = "city.util",
    "log.city.util.llc" = "log.city.util",
    "city.rank.llc" = "city.rank"
  )


city.utils.llc %>% anti_join(city.utils) %>%    left_join(get_cbsa_names(), by=c("cbsa"))



cities.dataset <- get_moves_in_out_from_migration_cbsa(migration.cbsa) %>%
  left_join(get_moves_in_out_from_migration_cbsa(migration.cbsa.llc), by=c("cbsa"), suffix=c("",".llc")) %>%
  left_join(get_cbsa_names(), by=c("cbsa")) %>%
  left_join(get_startup_births() %>% mutate(cbsa=as.numeric(cbsa)), by=c("cbsa")) %>%
  left_join(get_cbsa_pop(), by=("cbsa")) %>%
  select(-cbsa.name) %>%
  right_join(city.utils, by=("cbsa")) %>%
  left_join(city.utils.llc , by=("cbsa"))


citylist.all <- cities.dataset  


## Estimate migration CBSA values based on different areas.

get_city_utils <- function(year_range, cutoff_joint = 4, corp_or_llc = "corp") {
  csv_path = str_c("./data/cbsa_matrixCurrent",corp_or_llc,".",year_range,".csv")
  print(str_c("Loading csv from path: ", csv_path))
  cbsa_matrix <- bg.fx.load_migration_data_by_cbsa(filepath = csv_path)
  in_out <- get_moves_in_out_from_migration_cbsa(cbsa_matrix)
  utils <- estimate_city_utility(cbsa_matrix, cutoff_joint = cutoff_joint) %>%
    left_join(in_out)
  
  return (utils)
}


utils.0_2.corp <- get_city_utils("0-2", corp_or_llc =  "corp") %>% select(cbsa,city.rank) 
utils.2_5.corp <- get_city_utils("2-5", corp_or_llc =  "corp") %>% dplyr::select(cbsa,city.rank)

utils.0_2.llc <- get_city_utils("0-2", corp_or_llc =  "llc") %>% dplyr::select(cbsa,city.rank)
utils.2_5.llc <- get_city_utils("2-5", corp_or_llc =  "llc") %>% dplyr::select(cbsa,city.rank)


# Create the output city list

citylist <- rerank(citylist.all %>% filter(pop2010>1000000) ,also_llc = TRUE) %>%
  arrange(city.rank) %>%
  select(city.util, city.rank, cbsa,  cbsa.name, moves.in, moves.out, moves.in.llc, moves.out.llc,  city.rank.llc) %>%
  # left_join(utils.0_2.corp , by=c("cbsa"), suffix=c("",".0_2.corp")) %>%
  #  left_join(utils.2_5.corp , by=c("cbsa"), suffix=c("",".2_5.corp")) %>%
  mutate(moves.in = round(moves.in),
         moves.out = round(moves.out),
         moves.in.llc = round(moves.in.llc),
         moves.out.llc = round(moves.out.llc),
         city.util= round(log(city.util),4)
  )

# %>%    
#relocate(city.rank.0_2.corp,city.rank.2_5.corp, .before="city.rank.llc")  

#n = nrow(citylist)
#citylist$city.rank.0_2.corp[order(citylist$city.rank.0_2.corp)] = 1:n
#citylist$city.rank.2_5.corp[order(citylist$city.rank.2_5.corp)] = 1:n

citylist <- citylist%>%
  select(-cbsa) %>%
  rename(
    "\\makecell{Rank \\\\(age: 1-5)}" = "city.rank",
    #     "\\makecell{Rank \\\\(age: 1-2)}" = "city.rank.0_2.corp",
    #    "\\makecell{Rank \\\\(age: 3-5)}" = "city.rank.2_5.corp",
    "Log Utility"="city.util",
    "CBSA Name" = "cbsa.name",
    "Moves In" = "moves.in",
    "Moves Out" = "moves.out",
    "Moves In LLC" = "moves.in.llc",
    "Moves Out LLC" = "moves.out.llc",
    "\\makecell{LLC Rank \\\\(age: 1-5)}" = "city.rank.llc"
  )



#Output the Lists of Cities
citylist$`Log Utility` <- as.numeric(citylist$`Log Utility`)


outtex <- stargazer(citylist, summary=F, rownames = F, 
                    title="Estimated Utility for Large US Cities (Population over 1 million in 2010)")

outtex <- gsub("ccccccc","rrlrrrrrr", outtex)
outtex <- gsub("\\begin{tabular}", "\\resizebox{\\textwidth}{!}{\\begin{tabular}", outtex, fixed = T)
outtex <- gsub("\\textbackslash ", "\\", outtex, fixed = T)
outtex <- gsub("\\{", "{", outtex, fixed = T)
outtex <- gsub("\\}", "}", outtex, fixed = T)
outtex <- gsub("\\end{tabular}", "\\end{tabular}}", outtex, fixed = T)

writeLines(outtex,file("../tex/city_list.tex")) 





################################################
##
## Table 2: Summary Statistics Flows
##  NOTE: Table 3 (utility estimates) must be run first so that we apply a filter 
##        and print summary stats only on CBSAs for which we actually are eventually able to 
##        estimate utility.
##
##

agg_state <- bg.fx.load_migration_data_by_state() %>%
  filter(!(sourcestate %in% c("MD")) & 
           !(deststate %in% c("MD")))


state.flows <- data.frame(agg_state) %>% 
  select(move5.corp, move5.corp.nonzero, move5.llc ,move5.llc.nonzero) 


star.state = stargazer(state.flows,  type="latex", summary.stat=c("mean","sd", "N"), 
                       covariate.labels = c("Number of Corporation Movers","Number of Corporation Movers Cond. on $\\geq$ 1", "Number of LLC Movers","Number of LLC Movers Cond. on $\\geq$ 1"), 
                       title="Summary Statistics for Migrant Flows Data") 





cbsa.flows.corp <- bg.fx.load_migration_data_by_cbsa(include_llcs = FALSE) %>%
  select(source.cbsa, dest.cbsa, move5) %>%
  rename(move5.corp  = move5) 

cbsa.flows.llc <- bg.fx.load_migration_data_by_cbsa(include_llcs = TRUE) %>%
  select(source.cbsa, dest.cbsa, move5) %>%
  rename(move5.llc = move5)

cbsa.flows.withcbsas <- cbsa.flows.corp %>%
  full_join(cbsa.flows.llc, by=c("source.cbsa","dest.cbsa")) %>%
  filter((source.cbsa %in% citylist.all$cbsa) & (dest.cbsa %in% citylist.all$cbsa)) %>%
  filter(source.cbsa != dest.cbsa) %>%
  mutate(move5.corp = replace_na(move5.corp,0) ,
         move5.llc  = replace_na(move5.llc,0), 
         move5.corp.nonzero = ifelse(move5.corp < 1 , NA, move5.corp),
         move5.llc.nonzero = ifelse(move5.llc < 1, NA, move5.llc),
  ) 

cbsa.flows <- cbsa.flows.withcbsas %>%
  select(move5.corp, move5.corp.nonzero, move5.llc, move5.llc.nonzero)


star.cbsa = stargazer(cbsa.flows,   type="latex", summary.stat=c("mean","sd", "N"), covariate.labels = c("Number of Corporation Movers","Number of Corporation Movers Cond. on $\\geq$ 1", "Number of LLC Movers","Number of LLC Movers Cond. on $\\geq$ 1")
) 

outtex <- star_panel(star.state, star.cbsa, panel.names=c("State to State Migration Flows","CBSA to CBSA Migration Flows"),same.summary.stats=F)
outtex = star_insert_row(outtex, c("\\multicolumn{4}{p{.8\\columnwidth}}{\\footnotesize{}}"), insert.after=c(24))

writeLines(outtex,file("../tex/state_cbsa_summary_stats.tex"))






####################################################
##
##
## Table 4: Regression Table
##
##
year_range = "all"
get_city_utils <- function(year_range, cutoff_joint = 4, corp_or_llc = "corp") {
  csv_path = str_c("./data/cbsa_matrixCurrent",corp_or_llc,".",year_range,".csv")
  print(str_c("Loading csv from path: ", csv_path))
  cbsa_matrix <- bg.fx.load_migration_data_by_cbsa(filepath = csv_path)
  in_out <- get_moves_in_out_from_migration_cbsa(cbsa_matrix)
  utils <- estimate_city_utility(cbsa_matrix, cutoff_joint = cutoff_joint) %>%
    left_join(in_out)
  return (utils)
}

settings.corp_or_llc = "corp"

utils.0_5 <- get_city_utils("all", corp_or_llc =  settings.corp_or_llc) 
utils.0_1 <- get_city_utils("0-1", corp_or_llc =  settings.corp_or_llc) %>% select(cbsa,city.rank, log.city.util)
utils.0_2 <- get_city_utils("0-2", corp_or_llc =  settings.corp_or_llc) %>% select(cbsa,city.rank, log.city.util)
utils.2_5 <- get_city_utils("2-5", corp_or_llc =  settings.corp_or_llc) %>% select(cbsa,city.rank, log.city.util)



utils_reg <- utils.0_2 %>%
  inner_join(utils.2_5, by=c("cbsa"), suffix=c(".0_2",".2_5")) 

dim(utils_reg)

utils_all <- utils.0_5 %>%
  left_join(get_cbsa_names()) %>%
  left_join(get_cbsa_pop()) %>%
  left_join(get_startup_births())%>%
  mutate(births_per_pop = city.DEobs /pop2010 ,
         log_births_per_pop = log(births_per_pop),
         log_pop = log(pop2010)) %>%
  filter(!grepl("Towson, MD",cbsa.name))

dim(utils_all)

utils_all <- utils_all %>% inner_join(utils.0_2, by=c("cbsa"), suffix=c(".0_5",".0_2")) 

utils_all <- utils_all %>% inner_join(utils.2_5, by=c("cbsa"))
utils_all <- utils_all %>% inner_join(utils.0_1, by=c("cbsa"), suffix=c(".2_5",".0_1"))

names(utils_all)


utils_reg <- utils_reg %>%
  left_join(get_cbsa_names()) %>%
  left_join(get_cbsa_pop()) %>%
  left_join(get_startup_births())%>%
  mutate(births_per_pop = city.DEobs /pop2010 ,
         log_births_per_pop = log(births_per_pop),
         log_pop = log(pop2010)) %>%
  filter(!grepl("Towson, MD",cbsa.name))


new_hhi <- readstata13::read.dta13("./data/hhi.dta") %>%
  rename("cbsa"="msa")
data.external<-read.csv("./data/external_nonalbouy_185cbsas.csv")



regression_data.all <- utils_reg %>% 
  inner_join(data.external, by="cbsa") %>%
  filter(year == 2000) %>%
  arrange(-births_per_pop) %>%
  left_join(new_hhi,by=c("cbsa") )


data.albouy<- read.csv("./data/Albouy_185cbsas.csv") 

regression_data.all %>% anti_join(data.albouy, by=c("cbsa"))


regression_data.all <-   regression_data.all %>%
  left_join(data.albouy, by=c("cbsa"))

regression_data <- regression_data.all %>%
  arrange(-pop) %>%
  #filter(dplyr::row_number()<=50)
  filter(pop2010 > 500000)

reg_data_a <- regression_data %>% 
  select(births_per_pop , log_births_per_pop,tax.average_tax_rate_50th_perc, pop,cbsa,log.city.util.0_2, 
         cbsa.name,log_hhi2, patents_per_capita, tax.average_tax_rate_95th_perc, static_albouy.quality_of_life) %>%
  mutate(early = 1, later =0) %>%
  rename( "log.city.util" = "log.city.util.0_2")

reg_data_b <- regression_data %>% 
  select(births_per_pop , log_births_per_pop,tax.average_tax_rate_50th_perc, pop,cbsa,log.city.util.2_5, cbsa.name, 
         log_hhi2, patents_per_capita,tax.average_tax_rate_95th_perc, static_albouy.quality_of_life) %>%
  mutate(early = 0,later=1) %>%
  rename( "log.city.util" = "log.city.util.2_5")

reg_data_2 <- rbind(reg_data_a, reg_data_b) %>%
  mutate(log_patents_per_pop = log(patents_per_capita))


reg_data_2 <-reg_data_2[!grepl("Albany",reg_data_2$cbsa.name),]



#regression_weights = rep(1,length(reg_data_2$births_per_pop))
regression_weights = reg_data_2$births_per_pop
#regression_weights = reg_data_2$pop


reg1 = lm(log.city.util~log_births_per_pop*later, data=reg_data_2, weights=regression_weights)
reg2 = lm( log_births_per_pop~ log_hhi2  + log_patents_per_pop, data=reg_data_2, weights=regression_weights)
reg3 = lm( log.city.util~ log_hhi2 + later+  + log_hhi2:later , data=reg_data_2, weights=regression_weights)
reg4 = lm( log.city.util~  + later+ log_patents_per_pop +  log_patents_per_pop:later, data=reg_data_2, weights=regression_weights)
reg5 = lm(log_births_per_pop~tax.average_tax_rate_95th_perc, data=reg_data_2, weights=regression_weights)
reg6 = lm(log_births_per_pop~tax.average_tax_rate_50th_perc, data=reg_data_2, weights=regression_weights)
reg7 = lm(  log.city.util~ tax.average_tax_rate_95th_perc + later +tax.average_tax_rate_95th_perc:later, data=reg_data_2, weights=regression_weights)
reg8 = lm(  log.city.util~ tax.average_tax_rate_50th_perc + later +tax.average_tax_rate_50th_perc:later, data=reg_data_2, weights=regression_weights)




regression_covariate_labels = c("Growth Startups per Capita",
                                "Growth Startups per Capita $\\times$ Later Movers (Years 3-5)",
                                "Industry Concentration (HHI)",
                                "Industry Concentration (HHI) $\\times$ Later Movers (Years 3-5)",
                                "Patenting per Capita",
                                "Patenting per Capita $\\times$ Later Movers (Years 3-5)",
                                "Personal Income Tax (95th)",
                                "Personal Income Tax (95th) $\\times$ Later Movers (Years 3-5)",
                                "Personal Income Tax (50th)",
                                "Personal Income Tax (50th) $\\times$ Later Movers (Years 3-5)",
                                "Quality of Life Index",
                                "Quality of Life Index $\\times$ Later Movers (Years 3-5)"
                                
)

regression_variable_order = c("log_births_per_pop$","log_births_per_pop:later","^log_hhi2$","log_hhi.*","log_patents_per_pop$","log_patents.*",
                              "^tax.average_tax_rate_95th_perc$","tax.average_tax_rate_95th_perc.*",
                              "^tax.average_tax_rate_50th_perc$","tax.average_tax_rate_50th_perc.*",
                              "^static_albouy.quality_of_life$","static_albouy.quality_of_life.*")

out_tex = stargazer(reg1, reg2, reg3, reg4, reg5,reg6,reg7,reg8,
          se=starprep(reg1, reg2, reg3,reg4, reg5,reg6,reg7,reg8),
          star.cutoffs = c(.1, .05, .001), 
          column.labels = c("\\makecell{\\\\ \\emph{Migrant} \\\\ City Utility}",
                            "\\makecell{\\\\ City \\\\  Entrepreneurship}",
                            "\\makecell{\\\\ \\emph{Migrant} \\\\ City Utility}","\\makecell{\\\\ \\emph{Migrant} \\\\ City Utility}",
                            "\\makecell{\\\\ City \\\\ Entrepreneurship}","\\makecell{\\\\ City  \\\\ Entrepreneurship}",
                            "\\makecell{\\\\ \\emph{Migrant} \\\\ City Utility}","\\makecell{\\\\ \\emph{Migrant} \\\\ City Utility}"),
          dep.var.labels  = c("\\textbf{Baseline}",
                              "\\multicolumn{3}{|c|}{\\textbf{Nursery Cities}xxx","","}",
                              "\\multicolumn{4}{|c|}{\\textbf{Income Taxes}xxx","","","}"),
          multicolumn = F,
          covariate.labels   = regression_covariate_labels,
          order = regression_variable_order,
          keep = regression_variable_order,
          
         # 
          omit.stat=c("adj.rsq", "ser","f"),
          omit.table.layout = "n",
          omit=c("Constant"),
          float=FALSE)
out_tex= gsub(pattern = "xxx[ &]*",replacement = "",x = out_tex)

fileconn <- file( "../tex/section_5_table_city_util.tex")
writeLines(out_tex, fileconn)





################################################
##
## Figure : Migrant Utility by Age
##
##


rm(list=ls())

source("utils/utils.R")
source("utils/star_panel.R")
source("bryan_guzman_functions.R")
source("estimate_city_utility.R")
library(xtable)
library(dplyr)
library(conflicted)
conflict_prefer("select",winner="dplyr", quiet=FALSE)
conflict_prefer("filter",winner="dplyr", quiet=FALSE)
conflict_prefer("group_by",winner="dplyr", quiet=FALSE)
library(ggrepel)



## Figure 

## Estimate migration CBSA values based on different areas.
settings.corp_or_llc = "corp"
year_range = "all"
get_city_utils <- function(year_range, cutoff_joint = 4, corp_or_llc = "corp") {
  csv_path = str_c("./data/cbsa_matrixCurrent",corp_or_llc,".",year_range,".csv")
  print(str_c("Loading csv from path: ", csv_path))
  cbsa_matrix <- bg.fx.load_migration_data_by_cbsa(filepath = csv_path)
  in_out <- get_moves_in_out_from_migration_cbsa(cbsa_matrix)
  utils <- estimate_city_utility(cbsa_matrix, cutoff_joint = cutoff_joint) %>%
    left_join(in_out)
  return (utils)
}



utils.0_5 <- get_city_utils("all", corp_or_llc =  settings.corp_or_llc) 
utils.0_1 <- get_city_utils("0-1", corp_or_llc =  settings.corp_or_llc) %>% select(cbsa,city.rank, log.city.util)
utils.0_2 <- get_city_utils("0-2", corp_or_llc =  settings.corp_or_llc) %>% select(cbsa,city.rank, log.city.util)
utils.2_5 <- get_city_utils("2-5", corp_or_llc =  settings.corp_or_llc) %>% select(cbsa,city.rank, log.city.util)


utils_all <- utils.0_5 %>%
  left_join(get_cbsa_names()) %>%
  left_join(get_cbsa_pop()) %>%
  left_join(get_startup_births())%>%
  mutate(births_per_pop = city.DEobs /pop2010 ,
         log_births_per_pop = log(births_per_pop),
         log_pop = log(pop2010)) %>%
  filter(!grepl("Towson, MD",cbsa.name))

dim(utils_all)

utils_all <- utils_all %>% inner_join(utils.0_2, by=c("cbsa"), suffix=c(".0_5",".0_2")) 
utils_all <- utils_all %>% inner_join(utils.2_5, by=c("cbsa"))
utils_all <- utils_all %>% inner_join(utils.0_1, by=c("cbsa"), suffix=c(".2_5",".0_1"))


utils_all <- utils_all %>%
  arrange(-pop2010) %>%
  filter(dplyr::row_number()<=50)


utils_all <-utils_all[!grepl("Albany",utils_all$cbsa.name),]

dim(utils_all)

#plot_weights = rep(1,nrow(utils_all))
plot_weights = utils_all$births_per_pop


g0_5 <- ggplot(data = utils_all, aes(log.city.util.0_5, log_births_per_pop)) +
  geom_point(aes(size=births_per_pop), shape=1, color="darkgrey") +
  geom_smooth(method="lm", aes(weight=plot_weights)) +  
  theme_bw()+ theme(legend.position = "none") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  ylab("Log Delaware Startups per Pop") + xlab("City Utility for Migrants") +
  geom_text_repel(aes(label=cbsa.name), size=3, color="grey40") +
  ggtitle("A. Main Result: Firms Moving in Years 1 to 5")

g0_5

utils_all



g0_1 <- ggplot(data = utils_all, aes(log.city.util.0_1, log_births_per_pop)) +
  geom_point(aes(size=births_per_pop), shape=1) +
  geom_smooth(method="lm",aes(weight=plot_weights)) +  
  theme_bw()+ theme(legend.position = "none") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  ylab("Log Delaware Startups per Pop") + xlab("City Utility for Migrants") +
  geom_text_repel(aes(label=cbsa.name), size=3, color="grey40") +
  ggtitle("B. Firms Moving in Year 1")
g0_1

g0_2 <- ggplot(data = utils_all, aes(log.city.util.0_2, log_births_per_pop)) +
  geom_point(aes(size=births_per_pop), shape=1) +
  geom_smooth(method="lm",aes(weight=plot_weights)) +  
  theme_bw()+ theme(legend.position = "none") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  ylab("Log Delaware Startups per Pop") + xlab("City Utility for Migrants") +
  geom_text_repel(aes(label=cbsa.name), size=3, color="grey40") +
  ggtitle("C. Firms Moving in Year 1 or 2")
g0_2




g2_5 <- ggplot(data = utils_all, aes(log.city.util.2_5,log_births_per_pop)) +
  geom_point(aes(size=births_per_pop), shape=1) +
  geom_smooth(method="lm",  aes(weight=plot_weights)) +  
  theme_bw()+ theme(legend.position = "none") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
  ylab("Log Delaware Startups per Pop") + xlab("City Utility for Migrants") +
  geom_text_repel(aes(label=cbsa.name), size=3, color="grey40") +
  ggtitle("D. Firms Moving in Years 3 to 5")

g2_5



#dev.new(width=1200,height=800,noRStudioGD = TRUE, unit="px")
ggarrange(g0_5, g0_1, g0_2, g2_5, ncol=2, nrow=2)
ggsave(filename='../tex/Migrant_Utility_Across_by_Migration_Age.png', device = 'png', dpi = 900, width=15,height=10)#width=1100, height=800 ,units="px")



